{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "from finance_ml.model_selection import PurgedKFold"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.model_selection import GridSearchCV, RandomizedSearchCV\n",
    "from sklearn.ensemble import BaggingClassifier\n",
    "from sklearn.pipeline import Pipeline\n",
    "\n",
    "class MyPipeline(Pipeline):\n",
    "    def fit(self, X, y, sample_weight=None, **fit_params):\n",
    "        if sample_weight is not None:\n",
    "            fit_params[self.steps[-1][0] + '__sample_weight'] = sample_weight\n",
    "        return super(MyPipeline, self).fit(X, y, **fit_params)\n",
    "\n",
    "\n",
    "def clf_hyper_fit(feat, label, t1, pipe_clf, search_params, scoring=None,\n",
    "                  n_splits=3, bagging=[0, None, 1.],\n",
    "                  rnd_search_iter=0, n_jobs=-1, pct_embargo=0., **fit_params):\n",
    "    # Set defaut value for scoring\n",
    "    if scoring is None:\n",
    "        if set(label.values) == {0, 1}:\n",
    "            scoring = 'f1'\n",
    "        else:\n",
    "            scoring = 'neg_log_loss'\n",
    "    # HP serach on traing data\n",
    "    inner_cv = PurgedKFold(n_splits=n_splits, t1=t1, pct_embargo=pct_embargo)\n",
    "    if rnd_search_iter == 0:\n",
    "        search = GridSearchCV(estimator=pipe_clf, param_grid=search_params,\n",
    "                              scoring=scoring, cv=inner_cv, n_jobs=n_jobs, iid=False)\n",
    "    else:\n",
    "        search = RandomizedSearchCV(estimator=pipe_clf, param_distributions=search_params,\n",
    "                                    scoring=scoring, cv=inner_cv, n_jobs=n_jobs, iid=False)\n",
    "    best_pipe = search.fit(feat, label, **fit_params).best_estimator_\n",
    "    # Fit validated model on the entirely of dawta\n",
    "    if bagging[0] > 0:\n",
    "        bag_est = BaggingClassifier(base_estimator=MyPipeline(best_pipe.steps),\n",
    "                                   n_estimators=int(bagging[0]), max_samples=float(bagging[1]),\n",
    "                                   max_features=float(bagging[2]), n_jobs=n_jobs)\n",
    "        bag_est = best_pipe.fit(feat, label,\n",
    "                                sample_weight=fit_params[bag_est.base_estimator.steps[-1][0] + '__sample_weight'])\n",
    "        best_pipe = Pipeline([('bag', bag_est)])\n",
    "    return best_pipe"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "from scipy.stats import rv_continuous\n",
    "\n",
    "\n",
    "class LogUniformGen(rv_continuous):\n",
    "    def _cdf(self, x):\n",
    "        return np.log(x / self.a) / np.log(self.b / self.a)\n",
    "    \n",
    "def log_uniform(a=1, b=np.exp(1)):\n",
    "    return LogUniformGen(a=a, b=b, name='log_uniform')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "a = 1e-3\n",
    "b = 1e3\n",
    "size = 10000\n",
    "vals = log_uniform(a=a, b=b).rvs(size=size)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(10000,)"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "vals.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 9.1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "from finance_ml.datasets import get_cls_data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "                                 I_0       I_1       I_2       I_3       I_4  \\\n",
      "1980-03-17 07:14:33.589988  2.105359  2.861661  0.104159  0.686149  1.369429   \n",
      "1980-03-18 07:14:33.589988 -0.330754  1.464379 -1.405119  0.396713 -1.722305   \n",
      "1980-03-19 07:14:33.589988 -0.461334 -0.160432 -2.169501 -0.137535  0.398229   \n",
      "1980-03-20 07:14:33.589988 -1.573667  3.110105  0.073939  1.232501  1.069429   \n",
      "1980-03-21 07:14:33.589988  0.528677  1.538982 -1.603758  2.056413  0.777722   \n",
      "\n",
      "                                 N_0       N_1       N_2       N_3       N_4  \n",
      "1980-03-17 07:14:33.589988 -0.868903 -1.297125 -0.160205 -0.481024  0.841338  \n",
      "1980-03-18 07:14:33.589988  0.471952 -1.443687 -0.433773  0.123114 -0.102970  \n",
      "1980-03-19 07:14:33.589988 -0.278979 -1.860566  0.909540 -0.396742  2.455228  \n",
      "1980-03-20 07:14:33.589988  0.700720 -1.097145  0.157145 -1.699373  1.167458  \n",
      "1980-03-21 07:14:33.589988 -0.644594 -0.304476  0.682256 -0.644368  0.280994  \n",
      "                            bin       w                         t1\n",
      "1980-03-17 07:14:33.589988    0  0.0001 1980-03-17 07:14:33.589988\n",
      "1980-03-18 07:14:33.589988    0  0.0001 1980-03-18 07:14:33.589988\n",
      "1980-03-19 07:14:33.589988    0  0.0001 1980-03-19 07:14:33.589988\n",
      "1980-03-20 07:14:33.589988    0  0.0001 1980-03-20 07:14:33.589988\n",
      "1980-03-21 07:14:33.589988    0  0.0001 1980-03-21 07:14:33.589988\n"
     ]
    }
   ],
   "source": [
    "X, label = get_cls_data(n_features=10, n_informative=5, n_redundant=0, n_samples=10000)\n",
    "print(X.head())\n",
    "print(label.head())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.svm import SVC\n",
    "from sklearn.pipeline import Pipeline\n",
    "\n",
    "name = 'svc'\n",
    "params_grid = {name + '__C': [1e-2, 1e-1, 1, 10, 100], name + '__gamma': [1e-2, 1e-1, 1, 10, 100]}\n",
    "kernel = 'rbf'\n",
    "clf = SVC(kernel=kernel, probability=True)\n",
    "pipe_clf = Pipeline([(name, clf)])\n",
    "fit_params = dict()\n",
    "\n",
    "clf = clf_hyper_fit(X, label['bin'], t1=label['t1'], pipe_clf=pipe_clf, scoring='neg_log_loss',\n",
    "                    search_params=params_grid, n_splits=3, bagging=[0, None, 1.],\n",
    "                    rnd_search_iter=0, n_jobs=-1, pct_embargo=0., **fit_params)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 9.2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "name = 'svc'\n",
    "params_dist = {name + '__C': log_uniform(a=1e-2, b=1e2),\n",
    "               name + '__gamma': log_uniform(a=1e-2, b=1e2)}\n",
    "kernel = 'rbf'\n",
    "clf = SVC(kernel=kernel, probability=True)\n",
    "pipe_clf = Pipeline([(name, clf)])\n",
    "fit_params = dict()\n",
    "\n",
    "clf = clf_hyper_fit(X, label['bin'], t1=label['t1'], pipe_clf=pipe_clf, scoring='neg_log_loss',\n",
    "                    search_params=params_grid, n_splits=3, bagging=[0, None, 1.],\n",
    "                    rnd_search_iter=25, n_jobs=-1, pct_embargo=0., **fit_params)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
